

[-
    # Kernel options: each flag switches on the matching optional section of
    # the text assembled by the subs below.
    our ($beta, $bias, $relu, $prelu, $brelu, $bprelu, $bsum);

    # Kernel configuration. These are set externally; nothing in this section
    # assigns them.
    our ($prefix, $prop, $shareI, $shareF, $stepI, $stepF, $remapI, $remapF);

    # True only when the kernel prefix is 'h'.
    our $half = ($prefix eq 'h');

    # Shift count passed to the LEA address computations:
    # 2 for the 's' prefix, 1 for every other prefix.
    our $addr_shift = 1;
    $addr_shift     = 2 if $prefix eq 's';

    # Returns the kernel parameter map: one "name : c[0x0][0xNNN]" line per
    # parameter. The parameters occupy consecutive 4-byte constant-bank slots
    # starting at 0x140, so the offsets are derived from the position of each
    # name in the list below rather than spelled out by hand.
    sub params
    {
        my @names = qw(
            param_Sum[0]      param_Sum[1]
            param_X[0]        param_X[1]
            param_O[0]        param_O[1]
            param_I[0]        param_I[1]
            param_F[0]        param_F[1]
            param_alpha       param_beta        param_flags
            param_N           param_K           param_D
            param_H           param_W
            param_WN          param_HWN         param_DHWN
            param_C           param_KRST        param_RST         param_RS
            param_T           param_R           param_S
            param_magic_RS    param_shift_RS
            param_magic_S     param_shift_S
            param_pad_d       param_pad_h       param_pad_w
            param_str_d       param_str_h       param_str_w
            param_dil_d       param_dil_h       param_dil_w
            param_P2          param_Q           param_PQk
            param_Qk          param_k
            param_magic_PQk   param_shift_PQk
            param_magic_Qk    param_shift_Qk
            param_magic_k     param_shift_k
            param_QN          param_PQN         param_MPQN
            param_gridN       param_gridQN      param_gridPQN     param_gridMPQN
            param_magic_str_d param_shift_str_d
            param_magic_str_h param_shift_str_h
            param_magic_str_w param_shift_str_w
        );

        my $offset = 0x140;
        my $map    = '';
        foreach my $name (@names)
        {
            # Name is left-justified so that every ':' lands in the same column.
            $map    .= sprintf "    %-18s : c[0x0][0x%x]\n", $name, $offset;
            $offset += 4;
        }
        return $map;
    }

    # Returns the assembly text that decomposes the combined block indices.
    # The text is assembled from four stages, in order; the arithmetic each
    # stage performs is described by the comments embedded in the text itself.
    sub get_mpqk
    {
        my @stages;

        # Stage 1: idx_M and the idx_PQk remainder, from idx_MPQk.
        push @stages, <<'EOF';
// idx_M = idx_MPQk / blk_PQk
--:-:-:-:1      MOV  magic_PQk, param_magic_PQk;
--:-:-:-:1      ISETP.NE.AND P1, PT,   magic_PQk, 1, PT;
02:-:-:-:1  @P1 XMAD     div1, idx_MPQk,    magic_PQk,    RZ;
--:-:-:-:1  @P1 XMAD     div2, idx_MPQk,    magic_PQk.H1, RZ;
--:-:-:-:1  @P1 XMAD     div3, idx_MPQk.H1, magic_PQk.H1, RZ;
--:-:-:-:1  @P1 XMAD.CHI div1, idx_MPQk.H1, magic_PQk,    div1;
--:-:-:-:1  @P1 IADD3.RS idx_M, div1, div2, div3;
--:-:-:-:1  @P1 SHR.U32  idx_M, idx_M,    param_shift_PQk;
--:-:-:-:1 @!P1 SHR.U32  idx_M, idx_MPQk, param_shift_PQk;

// idx_PQk = idx_PQk % blk_Qk
--:-:-:-:1      IADD neg_PQk, RZ, -param_PQk;
--:-:-:-:1      XMAD.LO2 idx_PQk, neg_PQk, idx_M, idx_MPQk;

EOF

        # Stage 2: idx_P2 and the idx_Qk remainder, from idx_PQk.
        push @stages, <<'EOF';
// idx_P2 = idx_PQk / blk_Qk
--:-:-:-:1      MOV  magic_Qk, param_magic_Qk;
--:-:-:-:1      ISETP.NE.AND P2, PT,  magic_Qk, 1, PT;
--:-:-:-:1  @P2 XMAD     div1, idx_PQk,    magic_Qk,    RZ;
--:-:-:-:1  @P2 XMAD     div2, idx_PQk,    magic_Qk.H1, RZ;
--:-:-:-:1  @P2 XMAD     div3, idx_PQk.H1, magic_Qk.H1, RZ;
--:-:-:-:1  @P2 XMAD.CHI div1, idx_PQk.H1, magic_Qk,    div1;
--:-:-:-:1  @P2 IADD3.RS idx_P2, div1, div2, div3;
--:-:-:-:1  @P2 SHR.U32  idx_P2, idx_P2,  param_shift_Qk;
--:-:-:-:1 @!P2 SHR.U32  idx_P2, idx_PQk, param_shift_Qk;

// idx_Qk = idx_PQk % blk_Qk
--:-:-:-:1      IADD neg_Qk, RZ, -param_Qk;
--:-:-:-:1      XMAD.LO2 idx_Qk, neg_Qk, idx_P2, idx_PQk;

EOF

        # Stage 3: idx_Q2 and idx_k from idx_Qk, then the final idx_K.
        push @stages, <<'EOF';
// idx_Q2  = idx_Qk / k
--:-:-:-:1      XMAD.LO2C idx_Q2, idx_Qk, param_magic_k, RZ;
--:-:-:-:1      SHR.U32   idx_Q2, idx_Q2, param_shift_k;
// idx_k = idx_Qk % k
--:-:-:-:1      IADD neg_k, RZ, -param_k;
--:-:-:-:1      XMAD.S16.U16  idx_k, neg_k, idx_Q2, idx_Qk;

// idx_K = idx_K * blk_k + idx_k
04:-:-:-:1      XMAD idx_K, idx_K, param_k, idx_k;

EOF

        # Stage 4: remap (idx_P2, idx_Q2) to (idx_P, idx_Q) and reverse the
        # Q direction on odd idx_P2 rows.
        push @stages, <<'EOF';
// Implement a square wave block id remapping (for all but last row (if odd number of rows))
// idx_P = idx_P2 * 2
// idx_Q = idx_Q2
// if idx_P2 != gridP2:
//     idx_P += (idx_Q2 & 1) ^ ((idx_Q2 & 2)>>1)
//     idx_Q  = idx_Q2 >> 1
--:-:-:-:1      ISETP.NE.AND P1, PT, idx_P2, param_P2, PT;
--:-:-:-:1      SHL idx_P, idx_P2, 1;
--:-:-:-:1  @P1 LOP.AND q1, idx_Q2, 1;
--:-:-:-:1  @P1 BFE.U32 q2, idx_Q2, 0x101; // 1 bit at position 1
--:-:-:-:1  @P1 LOP.XOR q1, q1, q2;
--:-:-:-:1  @P1 IADD idx_P, idx_P, q1;
--:-:-:-:1  @P1 SHR.U32 idx_Q, idx_Q2, 1;
--:-:-:-:1 @!P1 MOV idx_Q, idx_Q2;

// Scan backwards on odd rows
// if idx_P2 & 1:
//     idx_Q = Q - idx_Q - 1
--:-:-:-:1      LOP.AND.NZ P2, RZ, idx_P2, 1;
--:-:-:-:1      MOV negOne, -1;
--:-:-:-:1  @P2 IADD3 idx_Q, -idx_Q, param_Q, negOne;

EOF

        return join '', @stages;
    }

    # Returns the assembly text that stores RZ to [addr_zero] once and then
    # emits sixteen 128-bit loads from that address into czero00, czero04,
    # ... czero60 (register names step by four).
    sub load_zeros
    {
        my $code = "--:-:-:-:1      STS.128 [addr_zero], RZ;\n";

        foreach my $quad (0 .. 15)
        {
            $code .= sprintf "--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", 4 * $quad;
        }

        return $code;
    }

    # Returns the opening part of the LUT setup text. It leaves a
    # <SCHEDULE_BLOCK> open; fprop_lut()/bprop_lut() append text that closes it.
    sub begin_lut
    {
        # Threads with P0 set branch straight to END_SETUP; the rest store mpqk.
        my $entry = <<'EOF';
--:-:-:-:5  @P0 BRA.U END_SETUP;

<SCHEDULE_BLOCK>
--:-:-:-:1      STS.128 [addr_mpqk], mpqk;

EOF

        # Loop state: rst starts at tid, the lut cursor and size start at zero.
        my $state = <<'EOF';
--:-:-:-:1      MOV rst,        tid;
--:-:-:-:1      MOV lutStore2,  RZ;
--:-:-:-:1      MOV lutSize,    RZ;
--:-:-:-:1      MOV warp_count, 32;

EOF

        # dep_thd_mask = negOne >> (32 - tid)
        my $mask = <<'EOF';
--:-:-:-:1      IADD    mask_shr, -tid, 32;
--:-:-:-:1      SHR.U32 dep_thd_mask, negOne, mask_shr;
EOF

        return $entry . $state . $mask;
    }

    # Returns the closing part of the LUT setup text: the per-warp compaction
    # of valid slices into the lut, the END_SETUP barrier, and the first
    # posCRST / channel / track address computation.
    #
    # The text is a sprintf template with a single argument, $addr_shift,
    # referenced as %1$s in the four LEA instructions at the end.
    #
    # Because the whole heredoc is a sprintf format, a literal percent sign
    # must be written as %%. A bare "% lutSize" is parsed as a conversion
    # (space flag, 'l' size, 'u' conversion) which silently consumes
    # $addr_shift and mangles the emitted comment.
    sub end_lut
    {
        my $template = <<'EOF';
<ORDERED>
// Get a mask of all valid slices in the warp
--:-:-:-:1      VOTE.ANY ballot, PT, P1;
// Count the total valid slices
--:-:2:-:1      POPC warp_slices, ballot;
// Prepare lutStore for this and next loop
--:-:-:-:1  @P1 MOV    lutStore, lutStore2;
02:-:-:-:1      ISCADD lutStore2, warp_slices, lutStore2, 3;
// Count all the valid slices below this threadid
--:-:-:-:1  @P1 LOP.AND dep_thd_bits, dep_thd_mask, ballot;
--:-:3:-:1  @P1 POPC dep_thd_cnt, dep_thd_bits;
// use the rst increment to space the barrier sync
--:-:-:-:1      IADD rst, rst, 32;
// Update the lutStore address from this count
04:-:-:-:1  @P1 ISCADD lutStore, dep_thd_cnt, lutStore, 3;
// Store both slice offsets in the lut
--:1:-:-:1  @P1 STS.64 [lutStore + addr_lut], sliceIF;
</ORDERED>
// Keep track of the total size of the lut
--:-:-:-:1      IADD lutSize, lutSize, warp_slices;
</SCHEDULE_BLOCK>

--:-:-:-:5  @P0 BRA.U LUT_LOOP;

// Share the lut size with the other warp
--:1:-:-:2      STS [addr_szLut], lutSize;

END_SETUP:

01:-:-:-:5      BAR.SYNC 0;

// Grab the caclulated lut size and get it's reciprical
// Get the total reduction depth
--:-:1:-:2      LDS lutSize, [addr_szLut];
01:-:-:-:0      XMAD endCRST, lutSize, param_C, RZ;
--:-:1:-:2      I2F.F32.S32 lutSizeRcp, lutSize;
01:-:1:-:1      MUFU.RCP lutSizeRcp, lutSizeRcp;

<SCHEDULE_BLOCK>
// lutSize != 0
--:-:-:-:1      LOP.AND.NZ P0, RZ, lutSize, -1;
// posCRST = endCRST - tidY - 1
--:-:-:-:1      IADD3 posCRST, endCRST, -1, -tidY;
// If this value is not a multiple of 8 we want to grab the partial amount on the first fetch.
// If it is a multiple of 8 then make a full 8 line fetch.
--:-:-:-:1      LOP.AND.Z P1, partial, endCRST, 7;
--:-:-:-:1  @P1 MOV partial, 8;
// channel = posCRST / lutSize
// Add an epsilon scaled to the size of the channel estimate then recompute and truncate it
--:-:2:-:1      I2F.F32.S32 posCRSTf, posCRST;
03:-:-:-:1      FMUL channel, posCRSTf, lutSizeRcp;
--:-:-:-:1      FFMA channel, channel, 5.9604644775390625e-08, channel;
--:-:2:-:1      F2I.S32.F32.TRUNC channel, channel;
// lutOffset = (posCRST %% lutSize) * 8
02:-:-:-:1      VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;
--:-:-:-:1      SHL lutOffset, lutOffset, 3;
// P1 = tidY < partial &&
--:-:-:-:1      ISETP.LT.AND P1, PT, tidY, partial, P0;
// offsetIC = channel * DHWN
// offsetFC = channel * K
--:-:-:-:1      XMAD.LO2C offsetIc, channel, param_DHWN, RZ;
--:-:-:-:1      XMAD      offsetFc, channel, param_KRST, RZ;
// posCRST -= partial
--:-:-:-:1      IADD posCRST, posCRST, -partial;
--:-:1:-:2  @P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];
</SCHEDULE_BLOCK>

// trackI = offsetIN + offsetIC + sliceI + param_I
// trackF = offsetFK + offsetFC + sliceF + param_F
01:-:-:-:1  @P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;
--:-:-:-:5  @P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;
--:-:-:-:6  @P1 LEA      trackF0.CC, offsetF, param_F[0],     %1$s;
--:-:-:-:1  @P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, %1$s;
--:-:-:-:6  @P1 LEA      trackI0.CC, offsetI, param_I[0],     %1$s;
--:-:-:-:0  @P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, %1$s;
EOF

        return sprintf($template, $addr_shift);
    }

    # Returns the complete LUT setup text for forward propagation: the shared
    # prologue from begin_lut(), the fprop-specific slice computation below,
    # and the shared epilogue from end_lut().
    sub fprop_lut
    {
        # Closes the <SCHEDULE_BLOCK> opened by begin_lut(), then defines the
        # LUT_LOOP body whose trailing instructions run into end_lut()'s text.
        my $body = <<'EOF';
// mt = m * w - pad_d
// pr = p * u - pad_h
// qs = q * v - pad_w
--:-:-:-:1      XMAD qs, q,   param_str_w, RZ;
--:-:-:-:1      XMAD pr, p,   param_str_h, RZ;
--:-:-:-:1      XMAD mt, m,   param_str_d, RZ;
--:-:-:-:1      IADD qs, qs, -param_pad_w;
--:-:-:-:1      IADD pr, pr, -param_pad_h;
--:-:-:-:1      IADD mt, mt, -param_pad_d;
</SCHEDULE_BLOCK>

LUT_LOOP:

<SCHEDULE_BLOCK>
// warp synchronous loop while warp_count < RST
--:-:-:-:1      ISETP.LT.AND P0, PT, warp_count, param_RST, PT;
--:-:-:-:1      ISETP.LT.AND P6, PT, rst, param_RST, PT;

--:-:-:-:1      IADD warp_count, warp_count, 32;
// t =  rst / RS
// rs = rst % RS
--:-:-:-:1      XMAD.LO2C t, rst, param_magic_RS, RZ;
--:-:-:-:1      SHR.U32   t, t, param_shift_RS;
--:-:-:-:1      XMAD  rs, t, param_RS, RZ;
--:-:-:-:1      IADD  rs, -rs, rst;
// r = rs / S
// s = rs % S
--:-:-:-:1      XMAD.LO2C r, rs, param_magic_S, RZ;
--:-:-:-:1      SHR.U32   r, r, param_shift_S;
--:-:-:-:1      XMAD   s, r, param_S, RZ;
--:-:-:-:1      IADD   s, -s, rs;
// x = qs + (s * dil_w)
// y = pr + (r * dil_h)
// z = mt + (t * dil_d)
--:-:-:-:1      XMAD x, s, param_dil_w, qs;
--:-:-:-:1      XMAD y, r, param_dil_h, pr;
--:-:-:-:1      XMAD z, t, param_dil_d, mt;
--:-:-:-:1      ISETP.GE.AND  P4, PT, x, RZ, PT;
--:-:-:-:1      ISETP.GE.AND  P5, PT, y, RZ, PT;
--:-:-:-:1      ISETP.GE.AND  P6, PT, z, RZ, P6;
--:-:-:-:1      ISETP.LT.AND  P4, PT, x, param_W, P4;
--:-:-:-:1      ISETP.LT.AND  P5, PT, y, param_H, P5;
--:-:-:-:1      ISETP.LT.AND  P6, PT, z, param_D, P6;
--:-:-:-:1      PSETP.AND.AND P1, PT, P4, P5, P6;

<ORDERED>
// sliceI = z*HWN + y*WN + x*N
01:-:-:-:1      XMAD      sliceI, x, param_N,   RZ;
--:-:-:-:1      XMAD.LO2C sliceI, y, param_WN,  sliceI;
--:-:-:-:1      XMAD.LO2C sliceI, z, param_HWN, sliceI;
// sliceF = rst * K
--:-:-:-:1      XMAD sliceF, rst, param_K, RZ;
</ORDERED>
EOF

        return join '', begin_lut(), $body, end_lut();
    }

    # Returns the complete LUT setup text for backward propagation: the shared
    # prologue from begin_lut(), the bprop-specific slice computation below
    # (which additionally divides x/y/z by the strides and keeps only exact
    # multiples), and the shared epilogue from end_lut().
    sub bprop_lut
    {
        # Closes the <SCHEDULE_BLOCK> opened by begin_lut(), then defines the
        # LUT_LOOP body whose trailing instructions run into end_lut()'s text.
        my $body = <<'EOF';
--:-:-:-:1      MOV str_d, param_str_d;
--:-:-:-:1      MOV str_h, param_str_h;
--:-:-:-:1      MOV str_w, param_str_w;
// qs = q - pad_w
// pr = p - pad_h
// mt = m - pad_d
--:-:-:-:1      IADD qs, q, -param_pad_w;
--:-:-:-:1      IADD pr, p, -param_pad_h;
--:-:-:-:1      IADD mt, m, -param_pad_d;
</SCHEDULE_BLOCK>

LUT_LOOP:

<SCHEDULE_BLOCK>
// warp synchronous loop while warp_count < RST
--:-:-:-:1      ISETP.LT.AND P0, PT, warp_count, param_RST, PT;
--:-:-:-:1      ISETP.LT.AND P6, PT, rst, param_RST, PT;
--:-:-:-:1      IADD warp_count, warp_count, 32;
// t =  rst / RS
// rs = rst % RS
--:-:-:-:1      XMAD.LO2C t, rst, param_magic_RS, RZ;
--:-:-:-:1      SHR.U32   t, t, param_shift_RS;
--:-:-:-:1      XMAD  rs, t, param_RS, RZ;
--:-:-:-:1      IADD  rs, -rs, rst;
// r = rs / S
// s = rs % S
--:-:-:-:1      XMAD.LO2C r, rs, param_magic_S, RZ;
--:-:-:-:1      SHR.U32   r, r, param_shift_S;
--:-:-:-:1      XMAD   s, r, param_S, RZ;
--:-:-:-:1      IADD   s, -s, rs;
// x = qs + (s * dil_w)
// y = pr + (r * dil_h)
// z = mt + (t * dil_d)
--:-:-:-:1      XMAD x, s, param_dil_w, qs;
--:-:-:-:1      XMAD y, r, param_dil_h, pr;
--:-:-:-:1      XMAD z, t, param_dil_d, mt;
--:-:-:-:1      ISETP.GE.AND  P4, PT, x, RZ, PT;
--:-:-:-:1      ISETP.GE.AND  P5, PT, y, RZ, PT;
--:-:-:-:1      ISETP.GE.AND  P6, PT, z, RZ, P6;
// x_prime = x / str_w
// x       = x % str_w
--:-:-:-:1      XMAD    x_prime, x, param_magic_str_w, RZ;
--:-:-:-:1      SHR.U32 x_prime, x_prime, param_shift_str_w;
--:-:-:-:1      VMAD.U16.U16 x, -x_prime, str_w, x;
// y_prime = y / str_h
// y       = y % str_h
--:-:-:-:1      XMAD    y_prime, y, param_magic_str_h, RZ;
--:-:-:-:1      SHR.U32 y_prime, y_prime, param_shift_str_h;
--:-:-:-:1      VMAD.U16.U16 y, -y_prime, str_h, y;
// z_prime = z / str_d
// z       = z % str_d
--:-:-:-:1      XMAD    z_prime, z, param_magic_str_d, RZ;
--:-:-:-:1      SHR.U32 z_prime, z_prime, param_shift_str_d;
--:-:-:-:1      VMAD.U16.U16 z, -z_prime, str_d, z;

--:-:-:-:1      ISETP.EQ.AND  P4, PT, x, RZ, P4;
--:-:-:-:1      ISETP.EQ.AND  P5, PT, y, RZ, P5;
--:-:-:-:1      ISETP.EQ.AND  P6, PT, z, RZ, P6;
--:-:-:-:1      ISETP.LT.AND  P4, PT, x_prime, param_W, P4;
--:-:-:-:1      ISETP.LT.AND  P5, PT, y_prime, param_H, P5;
--:-:-:-:1      ISETP.LT.AND  P6, PT, z_prime, param_D, P6;
--:-:-:-:1      PSETP.AND.AND P1, PT, P4, P5, P6;

// sliceI = z_prime*HWN + y_prime*WN + x_prime*N
01:-:-:-:1      XMAD      sliceI, x_prime, param_N,   RZ;
--:-:-:-:1      XMAD.LO2C sliceI, y_prime, param_WN,  sliceI;
--:-:-:-:1      XMAD.LO2C sliceI, z_prime, param_HWN, sliceI;
// sliceF = rst_prime * K
01:-:-:-:1      XMAD sliceF, rst, param_K, RZ;
EOF

        return join '', begin_lut(), $body, end_lut();
    }

    # Returns the LUT setup text for the configured direction: the fprop
    # variant when $prop is 'f', the bprop variant for any other value.
    sub load_lut
    {
        return fprop_lut() if $prop eq 'f';
        return bprop_lut();
    }

    # Returns the text emitted at the top of each main-loop iteration: the
    # barrier, the shared-memory write-buffer swap, the first four LDS.U.128
    # loads for j0, and the posCRST / channel / track address update.
    #
    # sprintf arguments, by explicit index:
    #   %1$ $shareI      %2$ $shareF      (left-justified to three columns)
    #   %3$ $stepI       %4$ $stepF
    #   %5$ $addr_shift  %6$ the buffer-swap text chosen below
    #
    # Because the whole heredoc is a sprintf format, a literal percent sign
    # must be written as %%. A bare "% lutSize" is parsed as a conversion
    # (space flag, 'l' size, 'u' conversion) which silently consumes the
    # first argument and mangles the emitted comment.
    sub loop_setup
    {
        my $swap;
        if ($shareI == $shareF)
        {
            # Equal share sizes: a single XOR on writeS flips the buffer.
            $swap = <<'EOF';
--:-:-:-:0      LOP.XOR writeS, writeS, 4x<szShareF + szShareI>;
EOF
        }
        else
        {
            # Otherwise both write pointers are moved by swapBuf, which is
            # then negated for the next iteration.
            $swap = <<'EOF';
--:-:-:-:1      IADD writeIs, writeIs, swapBuf;
--:-:-:-:1      IADD writeFs, writeFs, swapBuf;
--:-:-:-:0      IADD swapBuf, RZ,     -swapBuf;
EOF
        }

        my $template = <<'EOF';

--:-:-:-:0      ISETP.GE.AND P1, PT, posCRST, RZ, PT;
--:-:2:-:1      I2F.F32.S32 posCRSTf, posCRST;

01:-:-:-:5      BAR.SYNC 0;
%6$s

--:-:-:-:1      LDS.U.128 j0Ix0, [readIs + 4x<0*%1$-3s + 00>];
--:-:-:-:1      LDS.U.128 j0Fy0, [readFs + 4x<0*%2$-3s + 00>];
--:-:-:-:1      LDS.U.128 j0Ix4, [readIs + 4x<0*%1$-3s + %3$s>];
--:-:1:-:2      LDS.U.128 j0Fy4, [readFs + 4x<0*%2$-3s + %4$s>];

<SCHEDULE_BLOCK>
// channel = posCRST / lutSize
02:-:-:-:1  @P1 FMUL channel, posCRSTf, lutSizeRcp;
--:-:-:-:1  @P1 FFMA channel, channel, 5.9604644775390625e-08, channel;
--:-:2:-:1  @P1 F2I.S32.F32.TRUNC channel, channel;
// lutOffset = (posCRST %% lutSize) * 8
02:-:-:-:1  @P1 VMAD.U16.U16 lutOffset, -channel, lutSize, posCRST;
--:-:-:-:1  @P1 SHL lutOffset, lutOffset, 3;
// offsetIC = channel * DHWN
// offsetFC = channel * K
--:-:-:-:1  @P1 XMAD.LO2C offsetIc, channel, param_DHWN, RZ;
--:-:-:-:1  @P1 XMAD      offsetFc, channel, param_KRST, RZ;

--:-:-:-:1      IADD posCRST, posCRST, -8;
--:-:2:-:2  @P1 LDS.U.64 sliceIF, [lutOffset + addr_lut];
</SCHEDULE_BLOCK>

// trackI = offsetIN + offsetIC + sliceI + param_I
// trackF = offsetFK + offsetFC + sliceF + param_F
02:-:-:-:1  @P1 IADD3 offsetF, offsetFk, offsetFc, sliceF;
--:-:-:-:5  @P1 IADD3 offsetI, offsetIn, offsetIc, sliceI;
--:-:-:-:6  @P1 LEA      trackF0.CC, offsetF, param_F[0],     %5$s;
--:-:-:-:1  @P1 LEA.HI.X trackF1,    offsetF, param_F[1], RZ, %5$s;
--:-:-:-:6  @P1 LEA      trackI0.CC, offsetI, param_I[0],     %5$s;
--:-:-:-:0  @P1 LEA.HI.X trackI1,    offsetI, param_I[1], RZ, %5$s;
EOF

        return sprintf($template, $shareI, $shareF, $stepI, $stepF, $addr_shift, $swap);
    }

    # Returns the unrolled main loop: 8 j-steps of 64 FFMA instructions each,
    # with any text registered in %insert under "j<J>c<C>" emitted right after
    # FFMA number C of step J. This sub itself registers the four LDS.U.128
    # loads that prefetch the next step's operands at c0, c2, c4 and c6.
    sub main_loop
    {
        our %insert;

        # Order in which the 8x8 accumulator tile is visited: x advances in
        # pairs, the y list is walked forwards then backwards on alternating
        # x pairs, and every (x, y) anchor expands to the four swirl offsets.
        my @swirl  = ([0,2],[1,2],[1,0],[0,0]);
        my @ySweep = (0,1,4,5);
        my @cOrder;
        for my $xBase (0,2,4,6)
        {
            for my $yBase (@ySweep)
            {
                push @cOrder, map { [ $xBase + $_->[0], $yBase + $_->[1] ] } @swirl;
            }
            @ySweep = reverse @ySweep;
        }

        my $out;
        for my $j (0 .. 7)
        {
            # Operands are double buffered on the parity of j: this step reads
            # the j<odd> registers while loading the j<nOdd> ones.
            my $odd      = $j & 1;
            my $nOdd     = 1 - $odd;
            my $rsOffset = ($j + 1) % 8;

            # Only the loads issued during the last step are predicated on P0.
            my $rsPred = '   ';
            $rsPred    = '@P0' if $j == 7;

            $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dIx0, [readIs + 4x<%d*%-3s + 00>];\n", $rsPred, $nOdd, $rsOffset, $shareI;
            $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dFy0, [readFs + 4x<%d*%-3s + 00>];\n", $rsPred, $nOdd, $rsOffset, $shareF;
            $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dIx4, [readIs + 4x<%d*%-3s + %s>];\n", $rsPred, $nOdd, $rsOffset, $shareI, $stepI;
            $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dFy4, [readFs + 4x<%d*%-3s + %s>];\n", $rsPred, $nOdd, $rsOffset, $shareF, $stepF;

            for my $c (0 .. 63)
            {
                my ($x, $y) = @{ $cOrder[$c] };
                my $ins     = $insert{"j${j}c$c"} || '';

                # An FFMA followed by one of these instructions gets stall 0.
                my $stall = ($ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/) ? 0 : 1;
                # Yield once per step, at c == 32, and only on a stalling FFMA.
                my $yield = ($c == 32 && $stall) ? 'Y' : '-';
                # The first FFMA of every step waits on barrier 1.
                my $wait  = $c ? '--' : '01';

                $out .= sprintf "%s      FFMA cx%dy%d, j%dIx%d, j%dFy%d, cx%dy%d;\n%s",
                    "$wait:-:-:$yield:$stall",
                    $x, $y,
                    $odd, $x,
                    $odd, $y,
                    $x, $y,
                    $ins;
            }
        }
        return $out;
    }


    # Returns the text that prepares the output stage.
    #
    # Arguments:
    #   $tidOX      - mask ANDed with tidOX to set P5 (used only when $bsum)
    #   $warp_shift - shift applied when combining idx_N into bsum_offset
    #   $bsum_shift - right shift applied to tidOX for bsum_offset
    #
    # Sections, in order: the bsum offset computation (only when $bsum), the
    # out_offset / alpha / predicate setup (always), and the P1 predicate,
    # which is preceded by "n += $stepI" for every prefix except half.
    sub output_setup
    {
        my ($tidOX, $warp_shift, $bsum_shift) = @_;
        my $text;

        if ($bsum)
        {
            $text .= qq{
02:-:-:-:1      SHR.U32   bsum_offset, tidOX, $bsum_shift;
04:-:-:-:1      ISCADD    bsum_offset, idx_N, bsum_offset,   $warp_shift;
01:-:-:-:1      XMAD      bsum_offset, idx_Q, param_gridN,   bsum_offset;
--:-:-:-:1      XMAD.LO2C bsum_offset, idx_P, param_gridQN,  bsum_offset;
--:-:-:-:1      XMAD.LO2C bsum_offset, idx_M, param_gridPQN, bsum_offset;

--:-:-:-:1      LOP.AND.Z P5, RZ, tidOX, $tidOX;
        };
        }

        $text .= qq{
// out_offset = m*PQN + p*QN + q*N + n
01:-:-:-:1      XMAD      out_offset, q, param_N,    n;
--:-:-:-:1      XMAD.LO2C out_offset, p, param_QN,   out_offset;
--:-:-:-:1      XMAD.LO2C out_offset, m, param_PQN,  out_offset;

--:-:-:-:1      MOV alpha, param_alpha;
--:-:-:-:1      MOV32I one, 1.0;

--:-:-:-:1      ISETP.EQ.AND P2, PT, RZ, param_flags, PT; // no output
--:-:-:-:1      ISETP.LT.AND P0, PT, n, param_N, P2;
        };

        if ($half)
        {
            # Half precision: P1 tests the same n as P0.
            $text .= q{
--:-:-:-:1      ISETP.LT.AND P1, PT, n, param_N, P2;
        };
        }
        else
        {
            # Otherwise P1 tests n after it has been advanced by $stepI.
            $text .= qq{
--:-:-:-:1      IADD n, n, $stepI;
--:-:-:-:1      ISETP.LT.AND P1, PT, n, param_N, P2;
        };
        }

        return $text;
    }

    # Returns the output stage text: a barrier, eight "scale one accumulator
    # row by alpha and CAL STORE_O" groups, EXIT, and then the STORE_O
    # subroutine itself. STORE_O is assembled from optional sections selected
    # by the kernel option flags ($beta, $bias, $relu, $prelu, $brelu,
    # $bprelu, $bsum) and by $half.
    #
    # NOTE(review): several sections are single-quoted, so text such as
    # "$stepI" or "$remapI ? 4 : $stepI" inside 4x<...> is emitted verbatim;
    # presumably it is evaluated downstream by the assembler -- confirm.
    sub output
    {
        my $asm = q{
--:-:-:-:5      BAR.SYNC 0;
        };

        # One group per accumulator row y. Every group after the first begins
        # by advancing k: by 1, except at y == 4 without $remapF where the
        # increment is $stepF - 3.
        foreach my $y (0 .. 7)
        {
            my $stepK = "";
            if ($y)
            {
                my $incK = ($y == 4 && !$remapF) ? $stepF - 3 : 1;
                $stepK = "\n--:-:-:-:1      IADD k, k, $incK;";
            }

            $asm .= qq{$stepK
--:-:-:-:1      FMUL cs0, cx0y$y, alpha;
--:-:-:-:1      FMUL cs1, cx1y$y, alpha;
--:-:-:-:1      FMUL cs2, cx2y$y, alpha;
--:-:-:-:1      FMUL cs3, cx3y$y, alpha;
--:-:-:-:1      FMUL cs4, cx4y$y, alpha;
--:-:-:-:1      FMUL cs5, cx5y$y, alpha;
--:-:-:-:1      FMUL cs6, cx6y$y, alpha;
--:-:-:-:0      FMUL cs7, cx7y$y, alpha;
--:-:-:-:5      CAL STORE_O;
            };
        }

        # End of the caller, then the STORE_O entry: output offset and the
        # two store predicates.
        $asm .= q{

--:-:-:-:5      EXIT;

STORE_O:

<SCHEDULE_BLOCK>
30:-:-:-:1      XMAD offset, k, param_MPQN, out_offset;
--:-:-:-:1      XMAD.PSL offset, k, param_MPQN.H1, offset;
--:-:-:-:1      ISETP.LT.AND P2, PT, k, param_K, P0; // k < K && n < N
--:-:-:-:1      ISETP.LT.AND P3, PT, k, param_K, P1; // k < K && n < N
        };

        # beta / brelu / bprelu all read a second tensor through param_X.
        if ($beta || $brelu || $bprelu)
        {
            $asm .= qq{
--:-:-:-:1      LEA      Out0.CC, offset, param_X[0],     $addr_shift;
--:-:-:-:1      LEA.HI.X Out1,    offset, param_X[1], RZ, $addr_shift;
            };
            if ($half)
            {
                $asm .= q{
--:-:5:-:2  @P2 LDG.E.128 b0, [Out];
            };
            }
            else
            {
                $asm .= q{
--:-:5:-:1  @P2 LDG.E.128 b0, [Out + 4x<00>];
--:-:6:-:1  @P3 LDG.E.128 b4, [Out + 4x<$stepI>];
            };
            }
        }

        # bias: load one value per k through param_Sum.
        if ($bias)
        {
            $asm .= q{
--:-:-:-:1      LEA      Sum0.CC, k, param_Sum[0],     2;
--:-:-:-:1      LEA.HI.X Sum1,    k, param_Sum[1], RZ, 2;

--:-:6:-:1  @P2 LDG.E.CI b0, [Sum];
--:-:-:-:1 @!P2 MOV b0, RZ;
        };
        }

        # Shuffle the scaled row through shared memory (always emitted).
        $asm .= q{
<ORDERED>
--:-:-:-:1      STS.128 [writeCs + 4x<00>], cs0;
--:-:-:-:1      STS.128 [writeCs + 4x<$remapI ? 4 : $stepI>], cs4;
--:-:1:-:1  @P2 LDS.U.128 out0, [readCs + 4x<00>];
--:-:2:-:1  @P3 LDS.U.128 out4, [readCs + 4x<$half ? 4 : $stepI>];
</ORDERED>
</SCHEDULE_BLOCK>
<SCHEDULE_BLOCK>
        };

        if ($bias)
        {
            $asm .= q{
21:-:-:-:1      FADD out0, out0, b0;
--:-:-:-:1      FADD out1, out1, b0;
--:-:-:-:1      FADD out2, out2, b0;
--:-:-:-:1      FADD out3, out3, b0;
02:-:-:-:1      FADD out4, out4, b0;
--:-:-:-:1      FADD out5, out5, b0;
--:-:-:-:1      FADD out6, out6, b0;
--:-:-:-:1      FADD out7, out7, b0;
        };
        }

        if ($relu)
        {
            $asm .= q{
01:-:-:-:1      FMNMX out0, out0, RZ, !PT;
--:-:-:-:1      FMNMX out1, out1, RZ, !PT;
--:-:-:-:1      FMNMX out2, out2, RZ, !PT;
--:-:-:-:1      FMNMX out3, out3, RZ, !PT;
02:-:-:-:1      FMNMX out4, out4, RZ, !PT;
--:-:-:-:1      FMNMX out5, out5, RZ, !PT;
--:-:-:-:1      FMNMX out6, out6, RZ, !PT;
--:-:-:-:1      FMNMX out7, out7, RZ, !PT;
        };
        }

        if ($prelu)
        {
            $asm .= q{
// maximum(x, 0) + slope * minimum(0, x)
01:-:-:-:1      FMNMX b0, out0, RZ, !PT;
--:-:-:-:1      FMNMX b1, out1, RZ, !PT;
--:-:-:-:1      FMNMX b2, out2, RZ, !PT;
--:-:-:-:1      FMNMX b3, out3, RZ, !PT;
02:-:-:-:1      FMNMX b4, out4, RZ, !PT;
--:-:-:-:1      FMNMX b5, out5, RZ, !PT;
--:-:-:-:1      FMNMX b6, out6, RZ, !PT;
--:-:-:-:1      FMNMX b7, out7, RZ, !PT;

--:-:-:-:1      FMNMX x0, out0, RZ, PT;
--:-:-:-:1      FMNMX x1, out1, RZ, PT;
--:-:-:-:1      FMNMX x2, out2, RZ, PT;
--:-:-:-:1      FMNMX x3, out3, RZ, PT;
--:-:-:-:1      FMNMX x4, out4, RZ, PT;
--:-:-:-:1      FMNMX x5, out5, RZ, PT;
--:-:-:-:1      FMNMX x6, out6, RZ, PT;
--:-:-:-:1      FMNMX x7, out7, RZ, PT;

--:-:-:-:1      FFMA out0, x0, param_beta, b0;
--:-:-:-:1      FFMA out1, x1, param_beta, b1;
--:-:-:-:1      FFMA out2, x2, param_beta, b2;
--:-:-:-:1      FFMA out3, x3, param_beta, b3;
--:-:-:-:1      FFMA out4, x4, param_beta, b4;
--:-:-:-:1      FFMA out5, x5, param_beta, b5;
--:-:-:-:1      FFMA out6, x6, param_beta, b6;
--:-:-:-:1      FFMA out7, x7, param_beta, b7;
        };
        }

        $asm .= q{
</SCHEDULE_BLOCK>
        };

        # Half precision: widen the packed values loaded through param_X.
        if ($half && ($beta || $brelu || $bprelu))
        {
            $asm .= q{
13:-:-:-:1  @P2 F2F.F32.F16 b7, b3.H1;
--:-:-:-:1  @P2 F2F.F32.F16 b6, b3.H0;
--:-:-:-:1  @P2 F2F.F32.F16 b5, b2.H1;
--:-:-:-:1  @P2 F2F.F32.F16 b4, b2.H0;
--:-:-:-:1  @P2 F2F.F32.F16 b3, b1.H1;
--:-:-:-:1  @P2 F2F.F32.F16 b2, b1.H0;
--:-:-:-:1  @P2 F2F.F32.F16 b1, b0.H1;
--:-:5:-:2  @P2 F2F.F32.F16 b0, b0.H0;
        };
        }

        $asm .= q{
<SCHEDULE_BLOCK>
        };

        if ($beta)
        {
            $asm .= q{
11:-:-:-:1  @P2 FFMA out0, b0, param_beta, out0;
--:-:-:-:1  @P2 FFMA out1, b1, param_beta, out1;
--:-:-:-:1  @P2 FFMA out2, b2, param_beta, out2;
--:-:-:-:1  @P2 FFMA out3, b3, param_beta, out3;
22:-:-:-:1  @P3 FFMA out4, b4, param_beta, out4;
--:-:-:-:1  @P3 FFMA out5, b5, param_beta, out5;
--:-:-:-:1  @P3 FFMA out6, b6, param_beta, out6;
--:-:-:-:1  @P3 FFMA out7, b7, param_beta, out7;
        };
        }

        # brelu borrows P0-P3, so they are saved with P2R and restored with R2P.
        if ($brelu)
        {
            $asm .= q{
//delta *= (x > 0)
--:-:-:-:1      P2R preds, PR, RZ, 0x0f;
11:-:-:-:1      FSETP.GT.AND P0, PT, b0, RZ, PT;
--:-:-:-:1      FSETP.GT.AND P1, PT, b1, RZ, PT;
--:-:-:-:1      FSETP.GT.AND P2, PT, b2, RZ, PT;
--:-:-:-:1      FSETP.GT.AND P3, PT, b3, RZ, PT;
--:-:-:-:1 @!P0 MOV out0, RZ;
--:-:-:-:1 @!P1 MOV out1, RZ;
--:-:-:-:1 @!P2 MOV out2, RZ;
--:-:-:-:1 @!P3 MOV out3, RZ;
22:-:-:-:1      FSETP.GT.AND P0, PT, b4, RZ, PT;
--:-:-:-:1      FSETP.GT.AND P1, PT, b5, RZ, PT;
--:-:-:-:1      FSETP.GT.AND P2, PT, b6, RZ, PT;
--:-:-:-:1      FSETP.GT.AND P3, PT, b7, RZ, PT;
--:-:-:-:1 @!P0 MOV out4, RZ;
--:-:-:-:1 @!P1 MOV out5, RZ;
--:-:-:-:1 @!P2 MOV out6, RZ;
--:-:-:-:1 @!P3 MOV out7, RZ;
--:-:-:-:5      R2P PR, preds, 0x0f;
        };
        }

        # bprelu saves and restores P0-P3 the same way.
        if ($bprelu)
        {
            $asm .= q{
//delta *= ((x > 0) + slope * (x < 0))
--:-:-:-:1      P2R preds, PR, RZ, 0x0f;
11:-:-:-:1      FSETP.GT.AND P0, PT, b0, RZ, PT;
--:-:-:-:1      FSETP.GT.AND P1, PT, b1, RZ, PT;
--:-:-:-:1      FSETP.GT.AND P2, PT, b2, RZ, PT;
--:-:-:-:1      FSETP.GT.AND P3, PT, b3, RZ, PT;
--:-:-:-:1      SEL x0, one, RZ, P0;
--:-:-:-:1      SEL x1, one, RZ, P1;
--:-:-:-:1      SEL x2, one, RZ, P2;
--:-:-:-:1      SEL x3, one, RZ, P3;
--:-:-:-:1      FSETP.LT.AND P0, PT, b0, RZ, PT;
--:-:-:-:1      FSETP.LT.AND P1, PT, b1, RZ, PT;
--:-:-:-:1      FSETP.LT.AND P2, PT, b2, RZ, PT;
--:-:-:-:1      FSETP.LT.AND P3, PT, b3, RZ, PT;
--:-:-:-:1      SEL b0, one, RZ, P0;
--:-:-:-:1      SEL b1, one, RZ, P1;
--:-:-:-:1      SEL b2, one, RZ, P2;
--:-:-:-:1      SEL b3, one, RZ, P3;
--:-:-:-:1      FFMA b0, b0, param_beta, x0;
--:-:-:-:1      FFMA b1, b1, param_beta, x1;
--:-:-:-:1      FFMA b2, b2, param_beta, x2;
--:-:-:-:1      FFMA b3, b3, param_beta, x3;
--:-:-:-:1      FMUL out0, out0, b0;
--:-:-:-:1      FMUL out1, out1, b1;
--:-:-:-:1      FMUL out2, out2, b2;
--:-:-:-:1      FMUL out3, out3, b3;
22:-:-:-:1      FSETP.GT.AND P0, PT, b4, RZ, PT;
--:-:-:-:1      FSETP.GT.AND P1, PT, b5, RZ, PT;
--:-:-:-:1      FSETP.GT.AND P2, PT, b6, RZ, PT;
--:-:-:-:1      FSETP.GT.AND P3, PT, b7, RZ, PT;
--:-:-:-:1      SEL x4, one, RZ, P0;
--:-:-:-:1      SEL x5, one, RZ, P1;
--:-:-:-:1      SEL x6, one, RZ, P2;
--:-:-:-:1      SEL x7, one, RZ, P3;
--:-:-:-:1      FSETP.LT.AND P0, PT, b4, RZ, PT;
--:-:-:-:1      FSETP.LT.AND P1, PT, b5, RZ, PT;
--:-:-:-:1      FSETP.LT.AND P2, PT, b6, RZ, PT;
--:-:-:-:1      FSETP.LT.AND P3, PT, b7, RZ, PT;
--:-:-:-:1      SEL b4, one, RZ, P0;
--:-:-:-:1      SEL b5, one, RZ, P1;
--:-:-:-:1      SEL b6, one, RZ, P2;
--:-:-:-:1      SEL b7, one, RZ, P3;
--:-:-:-:1      R2P PR, preds, 0x0f;
--:-:-:-:1      FFMA b4, b4, param_beta, x4;
--:-:-:-:1      FFMA b5, b5, param_beta, x5;
--:-:-:-:1      FFMA b6, b6, param_beta, x6;
--:-:-:-:1      FFMA b7, b7, param_beta, x7;
--:-:-:-:1      FMUL out4, out4, b4;
--:-:-:-:1      FMUL out5, out5, b5;
--:-:-:-:1      FMUL out6, out6, b6;
--:-:-:-:1      FMUL out7, out7, b7;
        };
        }

        # bsum: per-thread partial sum of the eight outputs.
        if ($bsum)
        {
            $asm .= q{
--:-:-:-:1 @!P2 MOV  sum0, RZ;
--:-:-:-:1 @!P3 MOV  sum2, RZ;
01:-:-:-:1  @P2 FADD sum0, out0, out1;
--:-:-:-:1  @P2 FADD sum1, out2, out3;
02:-:-:-:1  @P3 FADD sum2, out4, out5;
--:-:-:-:1  @P3 FADD sum3, out6, out7;
--:-:-:-:1  @P2 FADD sum0, sum0, sum1;
--:-:-:-:1  @P3 FADD sum2, sum2, sum3;
--:-:-:-:1      FADD sum0, sum0, sum2;
        };
        }

        # Half precision: narrow the results before packing.
        if ($half)
        {
            $asm .= q{
<ORDERED>
01:-:-:-:1  @P2 F2F.F16.F32 out0, out0;
--:-:-:-:1  @P2 F2F.F16.F32 out1, out1;
--:-:-:-:1  @P2 F2F.F16.F32 out2, out2;
--:-:1:-:1  @P2 F2F.F16.F32 out3, out3;
02:-:-:-:1  @P2 F2F.F16.F32 out4, out4;
--:-:-:-:1  @P2 F2F.F16.F32 out5, out5;
--:-:-:-:1  @P2 F2F.F16.F32 out6, out6;
--:-:2:-:1  @P2 F2F.F16.F32 out7, out7;
</ORDERED>
        };
        }

        $asm .= q{
</SCHEDULE_BLOCK>
        };

        # Final store through param_O: one packed 128-bit store for half,
        # two 128-bit stores otherwise.
        if ($half)
        {
            $asm .= qq{
<SCHEDULE_BLOCK>
--:-:-:-:1      LEA      Out0.CC, offset, param_O[0],     $addr_shift;
--:-:-:-:1      LEA.HI.X Out1,    offset, param_O[1], RZ, $addr_shift;

01:-:-:-:1  \@P2 BFI c0, out1, 0x1010, out0;
--:-:-:-:1  \@P2 BFI c1, out3, 0x1010, out2;
02:-:-:-:1  \@P2 BFI c2, out5, 0x1010, out4;
--:-:-:-:1  \@P2 BFI c3, out7, 0x1010, out6;

--:5:-:-:1  \@P2 STG.E.CG.128 [Out], c0;
</SCHEDULE_BLOCK>
        };
        }
        else
        {
            $asm .= qq{
<SCHEDULE_BLOCK>
--:-:-:-:1      LEA      Out0.CC, offset, param_O[0],     $addr_shift;
--:-:-:-:1      LEA.HI.X Out1,    offset, param_O[1], RZ, $addr_shift;

01:-:-:-:1  \@P2 STG.E.CG.128 [Out + 4x<00>], out0;
02:5:-:-:1  \@P3 STG.E.CG.128 [Out + 4x<$stepI>], out4;
</SCHEDULE_BLOCK>
        };
        }

        # bsum: butterfly-reduce sum0 over shuffle distances 1, 2, 4 and
        # store it through param_Sum under P6.
        if ($bsum)
        {
            $asm .= q{
<SCHEDULE_BLOCK>
--:-:-:-:1      XMAD.LO2C offset, k, param_gridMPQN, bsum_offset;
--:-:-:-:1      LEA      Sum0.CC, offset, param_Sum[0],     2;
--:-:-:-:1      LEA.HI.X Sum1,    offset, param_Sum[1], RZ, 2;

--:-:-:-:1      ISETP.LT.AND P6, PT, k, param_K, P5; // k < K && tid31 == 0

--:-:2:-:2      SHFL.BFLY PT, sum1, sum0, 1,  0x1f;
02:-:-:-:4      FADD sum0, sum1, sum0;
--:-:2:-:2      SHFL.BFLY PT, sum1, sum0, 2,  0x1f;
02:-:-:-:4      FADD sum0, sum1, sum0;
--:-:2:-:2      SHFL.BFLY PT, sum1, sum0, 4,  0x1f;
02:-:-:-:2      FADD sum0, sum1, sum0;

--:6:-:-:1  @P6 STG.E.CG [Sum], sum0;
</SCHEDULE_BLOCK>
        };
        }

        $asm .= q{
--:-:-:-:5      RET;
        };

        return $asm;
    }

-]
